QTM Data Think Summer 2022¶

intro (why we did this project)¶

context (basic visualization of atlanta census tract data)¶

main question¶

data description¶

rental data basic visualization¶

correlation between rental price and location (spline visualization)¶

overview of solutions¶

Prediction (pros and cons of the candidate models, with justification of the model selected. Is housing price predictable?)¶

findings (visualization??)¶

Clustering (pros and cons of the candidate models, with justification of the model selected.)¶

findings (visualization of submarkets + discussion of features that categorize "similarity" and corresponding policy suggestions)¶

In [ ]:
# Notebook presentation setup: hide code cells and prompts when the page is
# rendered outside the notebook app, and widen the output area to full width.
from IPython.display import display, HTML
import IPython.core.display as di

di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)
# Override output_subarea width to 100% (default is 100% - 14ex).
PAGE_CSS = """#notebook div.output_subarea {max-width:100%;}"""
HTML('<style>{}</style>'.format(PAGE_CSS))
Out[ ]:
In [ ]:
from msilib.schema import Icon
from turtle import color
import folium
import pandas as pd
import numpy as np
#specific functions to call
# Marker: interactive point 
from folium import Choropleth, Circle, Marker
from folium.plugins import HeatMap, MarkerCluster
# icon options: beautify_icon
# from folium.plugins.beautify_icon import BeautifyIcon

# clustering 好多setup救命
from folium import plugins

#from sklearn.cluster import KMeans
#from sklearn.decomposition import PCA
import matplotlib.cm as cm
import matplotlib.colors as colors

#shp file
import shapefile
from shapely.geometry import Point, Polygon, LineString

def setup(input_name):
    #set up: read in data for df for single family dataframe
    # make df available through the file
    global df
    # for csv file
    if input_name[-1] == "v":
        df = pd.read_csv(input_name)
        df_name=input_name[0:-4]
    # for xlsx file 
    else:
        # for EXCEL | sheet_name可有可无 
        # df = pd.read_excel(input_name,sheet_name="wantedSheetName")
        df = pd.read_excel(input_name)
        df_name=input_name[0:-5]
    # show what options we have
    df.head()
    # basci info for ATL mapping
    # map = folium.Map(location = [33.779191,-84.369],zoom_start =10,prefer_canvas=True)
    return df_name

def setupList(input_name):
    """Read one .csv or .xlsx file and return it as a DataFrame.

    Unlike :func:`setup`, the result is returned to the caller and the
    module-global ``df`` is left untouched.
    """
    if input_name.endswith(".csv"):
        return pd.read_csv(input_name)
    # Excel; a sheet_name argument may be added to select a specific sheet
    return pd.read_excel(input_name)

def pop_up_map(df_name, max_markers=1000):
    """Save an HTML map with one price-popup marker per row of the global df.

    Parameters
    ----------
    df_name : str
        Label used in the output file name ("popUpMap_<df_name>.html").
    max_markers : int
        Cap on the number of markers plotted; popup markers only render
        responsively up to roughly 1000 points.
    """
    map1 = folium.Map(location=[33.779191, -84.369], zoom_start=10, prefer_canvas=True)
    for count, (_, row) in enumerate(df.iterrows()):
        if count >= max_markers:
            break
        Marker([row['Latitude'], row['Longitude']], popup=row['Price']).add_to(map1)
    name = "popUpMap_" + df_name + ".html"
    map1.save(name)
    return

def heat_map(df_name):
    """Save an HTML heat map showing the geographic density of the global df.

    Output file: "heatMap_<df_name>.html".
    Reference: https://zhuanlan.zhihu.com/p/392687123
    """
    base = folium.Map(location=[33.779191, -84.369], zoom_start=10, prefer_canvas=True)
    HeatMap(data=df[['Latitude', 'Longitude']]).add_to(base)
    base.save("heatMap_" + df_name + ".html")
    return

def clustering_map(df_name, num_cluster=5):
    """Save an HTML map coloring each listing circle by its Cluster_ID.

    Parameters
    ----------
    df_name : str
        Label used in the output file name ("clusteringMap_<df_name>.html").
    num_cluster : int
        Number of clusters present in df['Cluster_ID']; one rainbow color
        is generated per cluster. (Was previously an undefined global,
        which made the function unusable.)

    TODO: add a legend / per-cluster description (mean price, why grouped).
    """
    # Reference for this style of marker map: https://zhuanlan.zhihu.com/p/350647526
    map3 = folium.Map(location=[33.779191, -84.369], zoom_start=10, prefer_canvas=True)
    # evenly spaced rainbow colors, one per cluster
    colors_array = cm.rainbow(np.linspace(0, 1, num_cluster))
    rainbow = [colors.rgb2hex(c) for c in colors_array]
    # pick out the columns needed for plotting
    for lat, lng, cluster, street in zip(df['Latitude'], df['Longitude'], df['Cluster_ID'], df['Street']):
        folium.vector_layers.CircleMarker(
            [lat, lng],
            radius=3,  # circle size in pixels
            tooltip=str(street) + ',Cluster ' + str(cluster),  # more features could be added if wanted
            color=rainbow[cluster - 1],
            fill=True,
            fill_color=rainbow[cluster - 1],
            fill_opacity=0.9).add_to(map3)
    name = "clusteringMap_" + df_name + ".html"
    map3.save(name)
    return

def test_cluster(df_name):
    """EXPERIMENT: cluster markers purely by geographic density.

    Uses folium's MarkerCluster so nearby circles collapse into count
    bubbles as the user zooms out. Output:
    "clusteringMapTesting_<df_name>.html".
    """
    base = folium.Map(location=[33.779191, -84.369], zoom_start=10, prefer_canvas=True)
    # a marker-cluster layer that will absorb the individual circles
    marker_cluster = MarkerCluster().add_to(base)
    # one rainbow color per cluster id (5 clusters assumed here)
    palette = [colors.rgb2hex(c) for c in cm.rainbow(np.linspace(0, 1, 5))]
    # pick out the columns needed for plotting
    rows = zip(df['Latitude'], df['Longitude'], df['Cluster_ID'], df['Street'])
    for lat, lng, cluster_id, street in rows:
        circle = folium.vector_layers.CircleMarker(
            [lat, lng],
            radius=3,  # circle size in pixels
            tooltip=str(street) + ',Cluster ' + str(cluster_id),  # more features could be added if wanted
            color=palette[cluster_id - 1],
            fill=True,
            fill_color=palette[cluster_id - 1],
            fill_opacity=0.9)
        # feed the coordinates into the clustering layer, not the map itself
        circle.add_to(marker_cluster)
    base.add_child(marker_cluster)
    base.save("clusteringMapTesting_" + df_name + ".html")
    return

def checkR(df_name, num_group):
    """Save an HTML map marking each listing by its residual percentile.

    Rows with error_flag == 1 (above the 95th percentile) get an explicit
    popup marker so they stand out; all other rows get a circle colored by
    their 20%-wide percentile band.

    Parameters
    ----------
    df_name : str
        Label used in the output file name ("NEWSEEResiCheck_<df_name>.html").
    num_group : int
        Currently unused; reserved for a configurable number of percentile
        bands (the palette below is fixed at 5 colors).
    """
    map5 = folium.Map(location=[33.779191, -84.369], zoom_start=10, prefer_canvas=True)
    # light green -> dark purple, one color per 20% percentile band
    # (hoisted out of the loop; it is loop-invariant)
    rainbow = ["#DAF7A6", "#FFC300", "#FF5733", "#900C3F", "#581845"]
    cols = zip(df['Latitude'], df['Longitude'], df['Error'],
               df['ab_percentile'], df['error_flag'], df['Zip'])
    for lat, lng, error_value, p, error_flag, zc in cols:
        # percentile is stored as a fraction; scale to 0-100 for display
        per = float(p) * 100
        if error_flag == 1:
            # outlier: popup marker with the error value and percentile
            Marker([lat, lng],
                   popup="Over 95%!" + "\n Error Value:" + str(round(error_value, 3))
                         + "\n Percentile:" + str(round(per, 3)) + "%," + str(zc)).add_to(map5)
        else:
            # clamp so per == 100 cannot index past the 5-color palette
            group = min(int(per // 20), len(rainbow) - 1)
            folium.vector_layers.CircleMarker(
                [lat, lng],
                radius=4,  # circle size in pixels
                tooltip="Error Value:" + str(round(error_value, 3))
                        + " Percentile:" + str(round(per, 3)) + "%," + str(zc),
                fill_color=rainbow[group],
                color=rainbow[group],
                fill=True,
                fill_opacity=0.9).add_to(map5)
    name = "NEWSEEResiCheck_" + df_name + ".html"
    map5.save(name)
    return

def insert_percentileGroup(col):
    """Append rank and percentile columns for ``col`` to the global df.

    Adds two columns in place:

    * ``ab_RankNumber`` — "min" (competition) rank with the largest value
      ranked 1; tied values share the smallest rank of their group.
    * ``ab_percentile`` — 1 - rank / n, so the largest value gets the
      highest percentile (just under 1) and the smallest gets 0.

    Parameters
    ----------
    col : str
        Name of the numeric column of the global df to rank.
    """
    df['ab_RankNumber'] = df[col].rank(method='min', ascending=False)
    n = len(df[col])
    # vectorized replacement for the old per-row Python loop; same values
    percentile_list = (1 - df['ab_RankNumber'] / n).tolist()
    df.insert(df.shape[1], "ab_percentile", percentile_list, True)
    return

def multiLayer():
    """Build one folium map with a toggleable layer per k-means result file.

    Each input CSV holds the same rentals clustered at a different k. Every
    file becomes its own FeatureGroup (named after the trailing characters
    of the file name, e.g. "=10") so layers can be switched on and off via
    the LayerControl.

    Returns
    -------
    folium.Map
        The composed multi-layer map (not saved to disk by this function).
    """
    inputList = ["SingleFamilyRentalsAtlantaArea_07072022_clustered_k=10.csv",
        "SingleFamilyRentalsAtlantaArea_07072022_clustered_k=9.csv",
        "SingleFamilyRentalsAtlantaArea_07072022_clustered_k=8.csv",
        "SingleFamilyRentalsAtlantaArea_07072022_clustered_k=7.csv",
        "SingleFamilyRentalsAtlantaArea_07072022_clustered_k=6.csv",
        "SingleFamilyRentalsAtlantaArea_07072022_clustered_k=5.csv"]
    multiMap = folium.Map(location=[33.779191, -84.369], zoom_start=10, prefer_canvas=True)
    # up to 10 clusters across all files -> one rainbow color per cluster id
    colors_array = cm.rainbow(np.linspace(0, 1, 10))
    rainbow = [colors.rgb2hex(c) for c in colors_array]
    # build one layer per input file
    for file in inputList:
        # local df: the next iteration overwrites it; the global df is untouched
        df = setupList(file)
        dfName = file[-7:-4]  # crude layer label from the "k=N" file suffix
        pointsgroup = folium.FeatureGroup(name=dfName, control=True)
        for lat, lng, cluster, mPrice in zip(df['Latitude'], df['Longitude'], df['Cluster_ID'], df['Mean_Price']):
            folium.vector_layers.CircleMarker(
                [lat, lng],
                radius=3,  # circle size in pixels
                tooltip='cluster' + str(cluster) + ',cluster mean price:' + str(mPrice),  # more features could be added if wanted
                color=rainbow[cluster - 1],
                fill=True,
                fill_color=rainbow[cluster - 1],
                fill_opacity=0.9).add_to(pointsgroup)
        multiMap.add_child(pointsgroup)
    folium.LayerControl().add_to(multiMap)
    return multiMap

def censusPolyShape():
    """Stub: add a census-tract polygon layer of social-science data.

    TODO: decide whether this should take a map/FeatureGroup argument and
    return it with the census polygons added, or operate on a global map.
    Currently it only builds an empty Atlanta base map and returns None.
    """
    # add the polygon layer of the social science data from census
    # input map/group and return the map and group when done
    # OR operate on a global instead
    map6 = folium.Map(location = [33.779191,-84.369],zoom_start =10,prefer_canvas=True)

    return # eventually: return the layer with census data

def main():
    """Driver: load the residual-analysis CSV and render the chosen map(s).

    NOTE: ``df`` is the single module-global DataFrame; avoid rebinding it.
    The commented-out calls below are alternative visualizations that can
    be switched on as needed.
    """
    input_name="0725_rf_error_analysis_with_flag.csv"
    df_name=setup(input_name) # ONLY reads .CSV OR .XLSX

    # # Options:
    # pop_up_map(df_name)

    # heat_map(df_name)

    # for the residuals:
    # insert a percentile column for the selected column
    insert_percentileGroup("abs_error")

    # num_cluster = 5
    # clustering_map(df_name,num_cluster)

    # multiLayer()

    # test_cluster(df_name)
    checkR(df_name,5)
    # print(df_name)
    # df.info()

# main()
# Build the multi-layer cluster map and display it as the cell's last
# expression. Renamed from `map`, which shadowed the builtin of that name.
layered_map = multiLayer()
layered_map
Out[ ]:
Make this Notebook Trusted to load map: File -> Trust Notebook

conclusion: impacts of our study (social, academic). Drawback of whole project + next steps.¶